#################################################################################################
# GOV2001
# Final Papaer
# Martin Saveski and Abdullah Almaatouq
# May 4, 2015
#################################################################################################

setwd("~/Dropbox (MIT)/GOV2001/Product space/extension")

library(cem)
library(Zelig)
library(MatchingFrontier)
library(discretization)
library(infotheo)
library(texreg)
library(Hmisc)

################################################################################################################################################

load_data <- function(start_year, end_year, step, one_pair=F) {  
  vars_to_keep <- c("gdp_ppp", "population", "life_expectency", "years_of_education")
  
  df_all <- data.frame()
  country_features <- read.csv("../data/country_features.csv", header = T)
  eci_all_years <- read.csv("../data/ECI data/eci_all_years.csv", header = T)
  
  # fix years of education
  for(year in 1985:2005) {
    years_of_edu <- country_features[, sprintf("years_of_primary_education_%d", year)] + country_features[, sprintf("years_of_secondary_education_%d", year)]
    country_features[, sprintf("years_of_education_%d", year)] <- years_of_edu
  }
  
  for(year in seq(start_year, (end_year - step), by=step)) {
    cat("years:", sprintf("gdp_ppp_%d", year + step), sprintf("gdp_ppp_%d", year), "\n")
    
    country_features_year <- data.frame(country_features$code)
    names(country_features_year) <- c("code")
    
    for (var in vars_to_keep) {
      f_name <- sprintf("%s_%d", var, year)
      country_features_year[, var] <- country_features[, f_name]
    }
    country_features_year$growth <- (country_features[, sprintf("gdp_ppp_%d", year + step)] / country_features[, sprintf("gdp_ppp_%d", year)])
    country_features_year <- country_features_year[complete.cases(country_features_year), ]
    
    # load the ECI scores for this year
    eci_year <- eci_all_years[eci_all_years$year == year, ]
    
    # merge ECI + Country info
    df_year <- merge(country_features_year, eci_year[, c("code", "eci")])
    
    # append to the full matrix
    df_all <- rbind(df_all, df_year)
    
    # don't load all pairs
    if (one_pair == T) {
      break  
    }
  }
  
  return(df_all)
}

################################################################################################################################################

calculate.fsatt <- function(obj, formula, data, T1, T2, t.var) {
  out <- do.call("lm", list(formula = formula, data = data, weights = obj$w))
  fit <- fitted(out)
  response <- all.vars(formula)[attr(terms(formula), "response")]
  tmp.data <- data
  tmp.data[, obj$treatment] <- T1
  prd <- predict(out, tmp.data)
  TEi <- exp(data[, response]) - exp(prd)
  g <- function(i) {
    # matched + belongs to group T2
    idt <- which(obj$mstrata == i & obj$groups == T2)
    mean(TEi[idt])
  }
  mstrata <- na.omit(unique(obj$mstrata))
  TE <- sapply(mstrata, g)
  names(TE) <- mstrata
  idx <- which(is.na(TE))
  if (length(idx) > 0) 
    TE <- TE[-idx]
  reg.cem <- t(summary(out)$coefficients)
  rownames(reg.cem) <- c("Estimate", "Std. Error", "t value", "p-value")
  att.model <- reg.cem
  
  out <- list(att.model = att.model, tab = obj$tab, 
              treatment =  paste(t.var, T2, sep=""), extrapolate = F, 
              mod.type = "LR", TE = TE)
  class(out) <- "cem.att"
  return(out)
}

################################################################################################################################################

get_breaks <- function(var.num, var.cat) {
  # find cut points
  xc.all <- sort(unique(var.cat))
  cutp <- c()
  for(i in 1:(length(xc.all)-1)) {
    xc <- xc.all[i]
    xc.max <- max(var.num[var.cat == xc])
    cutp <- c(cutp, xc.max)
  }
  
  # add min and max as cut points
  cutp <- c(min(var.num), cutp, max(var.num))  
  return(cutp)
}

################################################################################################################################################

plot_discretization <- function(var.num, cutp, cols) {
  var.den <- density(var.num)
  plot(var.den, yaxt='n', xlab="", ylab="", main="", bty="n")
  
  # add min and max as cut points
  cutp <- c(min(var.den$x), cutp[2:(length(cutp) - 1)], max(var.den$x))  
  for(i in 2:length(cutp)) {
    c.min <- cutp[i - 1]
    c.max <- cutp[i]
    
    ind <- var.den$x > c.min & var.den$x <= c.max
    p.x <- c(var.den$x[ind], rev(var.den$x[ind]))
    p.y <- c(var.den$y[ind], rep(min(var.den$y), sum(ind)))
    
    polygon(p.x, p.y, col=cols[i-1], border=NA)  
  }
}

################################################################################################################################################

df <- load_data(1985, 2005, step=5, one_pair=F)
dim(df)

df$growth <- log(df$growth)
df$gdp_ppp <- log(df$gdp_ppp)
df$population <- log(df$population)

m1 <- lm(growth ~ gdp_ppp + eci, data=df)
sm1 <- summary(m1)

m2 <- lm(growth ~ gdp_ppp + population + life_expectency + years_of_education + eci, data=df)
sm2 <- summary(m2)

#
# DISCRETIZATION 
#

# ECI
df$eci_disc <- discretize(df$eci, disc="equalfreq", nbins=3)$X
df$eci_disc <- factor(df$eci_disc)

breaks <- list()

# gdp_ppp 
tmp <- chi2(df[, c("gdp_ppp", "eci_disc")], 0.0001, 0.1)
breaks$gdp_ppp <- get_breaks(df$gdp_ppp, as.factor(tmp$Disc.data$gdp_ppp))

# population
tmp <- discretize(df$population, disc="equalfreq", nbins=3)$X
breaks$population <- get_breaks(df$population, as.factor(tmp))

# life expectency
tmp <- chi2(df[, c("life_expectency", "eci_disc")], 0.0000001, 0.1)
breaks$life_expectency <- get_breaks(df$life_expectency, as.factor(tmp$Disc.data$life_expectency))

# years of education
tmp <- chi2(df[, c("years_of_education", "eci_disc")], 0.01, 0.1)
breaks$years_of_education <- get_breaks(df$years_of_education, as.factor(tmp$Disc.data$years_of_education))

#
# MATCHING
#
imbl.before <- imbalance(group=df$eci_disc, data=df, drop=c("code", "eci", "eci_disc", "growth"))
cem.match <- cem(treatment="eci_disc", data = df, drop = c("code", "eci", "eci_disc", "growth"), eval.imbalance=T, cutpoints=breaks)
cem.match
balance.gain <- (imbl.before$L1$L1 - cem.match$imbalance$L1$L1) / (imbl.before$L1$L1)

m3 <- lm(growth ~ gdp_ppp + life_expectency + population + years_of_education + eci_disc, data=df, weights = cem.match$w)
sm3 <- summary(m3)

#
# FSATT
#
# 1 -> 2
T1 <- "1"
T2 <- "2"
formula <- growth ~ gdp_ppp + life_expectency + population + years_of_education + eci_disc
out <- calculate.fsatt(cem.match, formula, df, T1, T2, "eci_disc")
mean(out$TE)

# 1 -> 3
T1 <- "1"
T2 <- "3"
formula <- growth ~ gdp_ppp + life_expectency + population + years_of_education + eci_disc
out <- calculate.fsatt(cem.match, formula, df, T1, T2, "eci_disc")
mean(out$TE)

# 2 -> 3
T1 <- "2"
T2 <- "3"
formula <- growth ~ gdp_ppp + life_expectency + population + years_of_education + eci_disc
out <- calculate.fsatt(cem.match, formula, df, T1, T2, "eci_disc")
mean(out$TE)

#
# SIMULATIONS
#
set.seed(0)

# model 1
z5 <- zls$new()
z5$zelig(growth ~ gdp_ppp + eci, data = df)

x_median <- data.frame(list(gdp_ppp = median(df$gdp_ppp), eci = 0))
eci_means <- aggregate(df$eci, by=list(df$eci_disc), mean)$x
N = length(eci_means)
preds1 <- matrix(0, N, 1)
lq1 <- matrix(0, N, 1)
uq1 <- matrix(0, N, 1)

for(i in 1:N) {
  x_median$eci <- eci_means[i]
  z5$setx(x_median)
  z5$sim()  
  est <- z5$sim.out$x$ev[[1]]
  preds1[i, ] <- mean(est)
  lq1[i, ] <- quantile(est, 0.25)
  uq1[i, ] <- quantile(est, 0.75)
}

# model 2
z5 <- zls$new()
z5$zelig(growth ~ gdp_ppp + population + life_expectency + years_of_education + eci, data = df)
x_median <- data.frame(list(gdp_ppp = median(df$gdp_ppp), 
                            population = median(df$population),
                            life_expectency = median(df$life_expectency),
                            years_of_education = median(df$years_of_education),
                            eci = 0))
eci_means <- aggregate(df$eci, by=list(df$eci_disc), mean)$x
N = length(eci_means)
preds2 <- matrix(0, N, 1)
lq2 <- matrix(0, N, 1)
uq2 <- matrix(0, N, 1)

for(i in 1:N) {
  x_median$eci <- eci_means[i]
  z5$setx(x_median)
  z5$sim()  
  est <- z5$sim.out$x$ev[[1]]
  preds2[i, ] <- mean(est)
  lq2[i, ] <- quantile(est, 0.25)
  uq2[i, ] <- quantile(est, 0.75)
}

# model 3
z5 <- zls$new()
z5$zelig(growth ~ gdp_ppp + population + life_expectency + years_of_education + eci, data = df, weights = cem.match$w)

N = length(eci_means)
preds3 <- matrix(0, N, 1)
lq3 <- matrix(0, N, 1)
uq3 <- matrix(0, N, 1)

for(i in 1:N) {
  x_median$eci <- eci_means[i]
  z5$setx(x_median)
  z5$sim()  
  est <- z5$sim.out$x$ev[[1]]
  preds3[i, ] <- mean(est)
  lq3[i, ] <- quantile(est, 0.25)
  uq3[i, ] <- quantile(est, 0.75)
}

# table
te <- cbind(exp(preds1), exp(preds2), exp(preds3))
rownames(te) <- c("low eci", "medium eci", "high eci")
colnames(te) <- c("m1", "m2", "m3")
print(te)

# plot
pdf("figures/matching_sims.pdf", width=7, height=6.5)
x.vals <- c(1, 2, 3)
errbar(x.vals - 0.1, exp(preds1), exp(preds1 + lq1), exp(preds1 - uq1),  xlab="", ylab="Expected Growth (in %)", ylim=c(0.97, 1.75), xlim=c(0.8, 3.2), col="red", errbar.col="red", yaxp  = c(1.0, 1.7, 7), xaxt='n', yaxt='n')
errbar(x.vals, exp(preds2), exp(preds2 + lq2), exp(preds2 - uq2), col="green", errbar.col="green", add=T)
errbar(x.vals + 0.1, exp(preds3), exp(preds3 + lq3), exp(preds3 - uq3), col="blue", errbar.col="blue", add=T)
axis(side=1, at=c(1:3), labels = c("Low ECI", "Medium ECI", "High ECI"))
axis(side=2, at=seq(1.0, 1.7, by=0.1), labels = seq(0, 70, by=10))
legend("topleft", col=c("red","green","blue"), pch=16, lwd=1, legend=c("Simple Model", "With control variables", "After matching"))
#abline(h=1.0, lty=2)
dev.off()

# plot discretization 
cols <- c("#CFF09E", "#79BD9A", "#0B486B")
w <- 7
h <- 2.5
  
pdf("figures/dist_eci.pdf", width=w, height=h)
plot_discretization(df$eci, get_breaks(df$eci, df$eci_disc), cols)
dev.off()

pdf("figures/dist_gdp_ppp.pdf", width=w, height=h)
plot_discretization(df$gdp_ppp, breaks$gdp_ppp, cols)
dev.off()

pdf("figures/dist_population.pdf", width=w, height=h)
plot_discretization(df$population, breaks$population, cols)
dev.off()

pdf("figures/dist_life_expectency.pdf", width=w, height=h)
plot_discretization(df$life_expectency, breaks$life_expectency, cols)
dev.off()

pdf("figures/dist_years_of_education.pdf", width=w, height=h)
plot(as.factor(df$years_of_education), col=c(rep(cols[1], 3), rep(cols[3], 2)), yaxt='n')
dev.off()

# lines plot
pdf("figures/parallel_plot_small.pdf", width=7, height=6.5)
dt <- df[,c("gdp_ppp", "population", "life_expectency", "years_of_education")]
normalize <- function(x) { (x - min(x)) / (max(x) - min(x)) }
dt$gdp_ppp <- normalize(dt$gdp_ppp)
dt$population <- normalize(dt$population)
dt$life_expectency <- normalize(dt$life_expectency)
dt$years_of_education <- normalize(dt$years_of_education)

# make plot
plot(1:4, rep(1, 4), ylim=c(0,1), type="n", yaxt='n', xaxt='n', xlab="", ylab="")
for (i in 1:nrow(dt)) {
  if (cem.match$matched[i] == T) {
    lines(1:4, dt[i,c("gdp_ppp", "population", "life_expectency", "years_of_education")], col="blue", lwd="0.5")
  } 
  else {
    lines(1:4, dt[i,c("gdp_ppp", "population", "life_expectency", "years_of_education")], col="red", lwd="0.4")
  }
}
axis(side=2, at=c(0,1), labels = c("Min", "Max"), las=1)
axis(side=1, at=c(1:4), labels = c("GDP", "Population", "Life expectency", "Years of education"))
abline(v=1:4, col='grey', lwd=1)
dev.off()

# regression tables
models_list <- list()
models_list[[1]] <- m1
models_list[[2]] <- m2
models_list[[3]] <- m3

texreg(models_list, dcolumn=T, booktabs=T, use.packages=F, 
       digits = 3, label = "tab:1", float.pos = "h", scalebox = 1.0, single.row = TRUE,    
       caption = "Regression coefficients for income per capita (SI, Table S6)")

# END